org 100h   ; assume ax=bx=0 cx=0xff

;UNPACK:   ; ax=0xd60f bx=0x158 cx=0 si=0x1d3 di=0x34d

;Prepare a table of powers of 4 (for SSE)
;[0xfff0]=0x00000000 [0xffe0]=0x01000000 ... [0xf000]=0xff000000
;[0xeff0]=0x00000000 [0xefe0]=0x01000000 ... [0xe000]=0xff000000
;                                            ...
;                                            [0x8000]=0xff000000

;Build the constant table by pushing dwords while sp walks down through the upper half of the segment.
;Every push pair stores one dword = ax:di (ax in the high word, di in the low word).
  pop di     ; 275; di becomes the low word of every table entry
             ; NOTE(review): the table layout described above needs di=0 here; nothing visible sets it - confirm the popped word is 0
PREP:
  imul ax,sp,-16 ; ax = -16*sp, so ah steps by one for every 16 bytes of stack
  mov al,0  ; clear low byte: store each value 4 times
  push ax   ; high word of the entry
  push di   ; low word of the entry
  test sp,sp
  js PREP ; repeat while sp>=0x8000; on exit sp=0x7ffc ax=0 (cx is not modified by this loop)

;  pop di     ; 275
;  mov ch,8
;PREP:
;  imul ax,cx,0x40
;  mov al,0  ; al=0
;  push ax
;  push di
;  loop PREP ; sp=0x8000 ax=cx=0

;  pop di     ; di=0 (needs to be 0 mod 4)
;  mul cx     ; ax=dx=0
;PREP:
;  sub dx,0x40; store each constant four times
;  mov ah,dh
;  push ax
;  push di
;  jnz PREP   ; ah=cx=0 sp=0xe000


;  pop di     ; di=0 (needs to be 0 mod 4)
;  mov ch,4
;PREP:
;  imul ax,cx,4
;
;  loop PREP


;Video and FPU setup
  mov al,13h        ; ah is still 0 from the table loop, so ax=0x0013
  int 10h           ; BIOS video service, ah=0: set video mode 13h
  fninit            ; reset the x87 stack before the per-frame code uses it
  fild word[si]     ;| t=t0  (16-bit integer read from ds:si)
                    ; NOTE(review): si is not initialised in the visible code; presumably the loader's entry value - confirm

;Centering segments for the 0xcccd trick: mov ax,0xcccd | mul pixel_address | add dx,segment
; segment=0x9f??: error in pixels = (segment*16-0xa0000 - ((x&0xff)-128)/256*320) % 320
; nice values:
;   0x9ff5 -2.25
; ->0x9fe0 +8       choose this one because it's divisible by 0x10
;   0x9fdf -6.75
;   0x9fca +3.5
;   0x9fb4 -1
; ----------------- 0x9fa0 is the lowest segment that can access the whole screen
;   0x9f9e -5.5
;   0x9f89 +4.75
;   0x9f73 +0.25
  mov si,0x9fe0   ; si stays 0x9fe0 from here on: it is added to dx after "mul di", used as [si] scratch,
                  ; and serves as a base register in [bx+si] and the K_* address expressions
  mov es,si       ; es=0x9fe0: destination segment for the stosb pixel writes

;Palette: luminance * rainbow
;  mov dx,3c8h
;  xor ax,ax
;  out dx,al
;  inc dx
;PAL:
;  or ax,0b1100001111000011  ; bx = 11LLLL11 11HHHH11
;  push ax
;Q out dx,al
;  imul ah
;  shr ax,7
;  loop Q
;  pop ax
;  inc ax
;  jnz PAL

;si=9fe0
;bx=0480: bx+si=a460
;bx=0420: bx+si=a400
;K(x): offset (0xa000..0xaff0) of the 16-byte constant-table row whose four dwords are (x & 0xff00) << 16,
;      i.e. x is the upper 16 bits of the wanted float and only its high byte is significant.
;The body and the argument are parenthesised so the macro expands correctly inside any expression
;(not only the purely additive ones used below) and with any argument expression.
%define K(x) (0xa000 + (((~(x))&0xff00) >> 4))

;Addresses of the packed float constants used below (each is a 16-byte row of the table built at startup).
;Most are written as base registers plus a small remainder: with si=0x9fe0 and bx=0x480 the
;"si-0x9fe0+bx-0x480+K(..)" form evaluates to K(..) while leaving only a short displacement in the instruction.
;The bp-based forms are used while bp=0x5300 (set before the ray march), so they resolve to K(..)+0x5000;
;the table repeats every 0x1000 bytes, so that row holds the same value as the address noted in each comment.
;Comment format:  ;<address as K(..)>; <float value stored there>
%define K_TIME_DELTA        si-0x9fe0+bx-0x480+K(0xbc00)  ;a430; -1/128

%define K_D0                K(0x4f00)  ;ab00; 2**31  (absolute address, no base registers)
%define K_NEG_D0            si-0x9fe0+bp-0x300+K(0xcf00)  ;a300; -2**31

;%define K_EPS               K(0x3c80)  ; 0x3c60=0.875/64, 0x3c80=1/64
;%define K_BRIGHT_MAGIC      K(0x4640)  ; ulp=EPS/16 = 1/1024 => magic=1.5*2^13
;%define K_EPS               K(0x3ce0)  ; 0x3ce0=0.875/32
%define K_NEG_EPS           si-0x9fe0+bp-0x300+K(0xcc00)  ;a330; -1/64 * 2**31
;%define K_BRIGHT_MAGIC      K(0x46c0)  ; ulp=EPS/16 = 1/512 => magic=1.5*2^14

;%define K_BRIGHT_MAGIC      K(0x4740)  ; ulp=EPS/8 = 1/512 => magic=1.5*2^15
;%define K_BRIGHT_MAGIC      K(0x56c0)  ; ulp=EPS/8 = 1/512 * 2**31 => magic=1.5*2^(15+31)
%define K_BRIGHT_MAGIC      si-0x9fe0+bp-0x300+K(0xd700)  ;a280; ulp=EPS/8 = 1/512 * 2**31 => magic=2.0*2^(15+31) (stored with the sign bit set)

%define K_HUE_MAGIC         si-0x9fe0+bp-0x300+K(0xd900)  ;a260; ulp=2/8 * 2**31 = 1/16 * 2**31  => magic=2.0*2^(19+31) (stored with the sign bit set)
;%define K_HUE_MAGIC         K(0x58c0)  ; ulp=2/8 * 2**31 = 1/16 * 2**31  => magic=1.5*2^(19+31)
;%define K_HUE_MAGIC         K(0x48c0)  ; ulp=2/16   = 1/32  => magic=1.5*2^18

;%define K_NEG_ABS           K(0x8000)  ; -0: sign bit for -abs
;%define K_RCP_SCALE         si-0x9fe0+K(0x4080)  ; 4 (for rcpps)
;%define K_SCALE             K(0x3e80)  ; 1/4

%define K_TRANSLATION       si-0x9fe0+bp-0x300+K(0xce00)  ;a310; -0.25 * 2**31

;%define K_NEG_2             si-0x9fe0+bx-0x480+K(0xc000)  ; -2
;%define K_NEG_HALF_SCALE    si-0x9fe0+bx-0x480+K(0xbe00)  ; -1/8
%define K_NEG_2             si-0x9fe0+bx-0x480+K(0xc000)  ;a3f0; -2  (also used as the bit mask for -|v|)
%define K_NEG_HALF_SCALE    si-0x9fe0+bx-0x480+K(0xbe00)  ;a410; -1/8


;For 16:9 screens: pixel aspect ratio = 1.03
;%define K_X_SCALE           K(0x3020)  ; 2.5 * 2**-32: x -> ..1.25
;%define K_Y_SCALE           K(0x2fe0)  ; 1.75 * 2**-32: y -> ..0.6836

;For 4:3 screens: pixel aspect ratio = 0.96
;%define K_X_SCALE      K(0x3f80)  ; 1.0    ; 2.0 * 2**-32: x -> ..1.0
;%define K_Y_SCALE      K(0x3f80)  ; 1.0    ; 2.0 * 2**-32: y -> ..0.7813

;For each frame: prepare rotation constants
;Three angles are derived from the time t kept on the x87 stack: t, log2(e)*t and log10(2)*t.
;(Stack contents are listed after "|", top of stack first.)
M fld st0
  fsincos          ;| C1 S1 t
  fldl2e
  fmul st3         ;| 1.44*t C1 S1 t
  fsincos          ;| C2 S2 C1 S1 t
  fldlg2
  fmul st5         ;| 0.30*t C2 S2 C1 S1 t
  fsincos          ;| C3 S3 C2 S2 C1 S1 t

;Store each constant four times
;so that every 16-byte row at 0x420..0x470 can be used as a packed SSE operand.
  mov bx,0x420     ; bh=4
STORE:
  mov cl,4         ; 4 copies per constant (ch is 0)
STORE4:
  fst dword[bx]    ;0x400 10 20 30 40 50 60 70 80
  add bl,bh        ;         C3 S3 C2 S2 C1 S1 XY, scratch, pixel data
  loop STORE4      ; loop leaves the flags from "add bl,bh" intact
  fstp st0         ; drop the constant just stored (x87 pop, integer flags unchanged)
  jns STORE        ; loop 6 times: bx=0x480  (sign flag = bit 7 of bl after the last add)

  fadd dword[K_TIME_DELTA] ;| t+=dt  (dt is the -1/128 table constant; needs bx=0x480 and si=0x9fe0)

;Row addresses for the rotation loop in MAP, which steps bl through 0x20, 0x40, 0x60
%define COS bx
%define SIN bx+0x10

;For each 4-pixel batch:
;  1. write out the 4 pixels whose brightness/hue/mask were produced by the previous pass
;  2. overwrite the same slots with integer X/Y coordinates derived from di, for MAP to read
X mov cl,4         ; bx=0x480

;Combine brightness and hue from the last batch
B shr bp,1       ; background mask: next movmskps bit -> CF

;  sbb ax,ax
;  and ah,[bx]      ; brightness
;  jz Z             ; 0 (background)
;  mov al,[bx+si]   ; hue
;  aad 26           ; al += 26*ah
;Z stosb

  salc           ; undocumented opcode: al = CF ? 0xff : 0x00
  and al,[bx]    ; brightness (-1..-16) or 0 (background)
  jz Z
  add al,[bx+si] ; add the hue byte (low byte of the stored orbit value)
  xor al,0xef    ; al = 15-al for al in -1..-16
Z stosb          ; es:[di] = al, di++

;Store coordinates for this batch
  mov ax,0xcccd
  mul di         ; dx:ax = 0xcccd * di
  add dx,si      ; 0xcccd*pixel_address + 0x9fe00000: center X and Y

;  ; 10 bytes, free si
;  mov [bx],ax
;  mov [bx+2],dx
;  add bl,bh
;%define INT_X bx-1     ; x = 2^32 * (-0.5..0.5)
;%define INT_Y bx       ; y = 0xcccd * 320 * (-100..100) = 2^32 * (-0.3906..0.3906)

  ; 10 bytes, free bh
  ; The four "inc bx" advance bx by one dword per pixel; byte 0 of X and bytes 0-1 of Y
  ; keep whatever the previous pass left there (the "__" positions below).
  inc bx
  mov [bx],ax     ; 0x0480: X = dl:ah:al:__
  inc bx          ;             +3 +2 +1 +0
  mov [bx+si],dx  ; 0xa460: Y = dh:dl:__:__
  inc bx
  mov [bx],dl     ; top byte of X
  inc bx
%define INT_X bx       ; x = 2^32 * (-0.5..0.5)
%define INT_Y bx+si    ; y = 0xcccd * 320 * (-100..100) = 2^32 * (-0.3906..0.3906)

  loop B         ; di+=4 bx=0x490
  dec di         ; paired with the "inc di" before "jnz X", which supplies the zero flag for the end-of-screen test
  mov bl,0x80    ; bx back to 0x480 for MAP and the stores below

;Symbolic names for the SSE registers; each register carries one value per pixel of the 4-pixel batch.
%define x xmm0 ; XYZ coordinates in the fractal iteration
%define y xmm1
%define z xmm2
%define o xmm3 ; output: orbit trap
%define a xmm4 ; scratch, output: estimated distance
%define b xmm5 ; scratch
%define c xmm6 ; translation [-c,-c/4,0]
%define d xmm7 ; depth (camera Z)

;Trace steps along a ray
;d is the depth of the 4 rays; every MAP call returns a = -(distance estimate), which is added to d.
  mov bp,0xa2e0-0x9fe0+0x5000   ; 0x5300: base for the bp-relative K_* constants (set per batch: movmskps below overwrites ebp)
  mov cl,24      ; 24 steps (MAP returns with ch=0, so cx=24)
  movaps d,[K_D0]; d=1
T call MAP
  addps d,a     ; d += -map(X,Y,d)
  loop T

;Compute Normal.Z (scaled by ambient occlusion)
  movaps [si],a    ; store last step  (scratch row at ds:0x9fe0)
  subps d,[K_NEG_EPS] ; d += EPS
  subps d,a     ; undo last "addps d,a"
  call MAP      ; a = -map(X,Y,d+EPS)
  subps a,[si]     ; a = -(map(X,Y,d+EPS) - map(X,Y,d))

;Clip by the far plane, reject normals pointing away
  subps d,[K_NEG_D0]; d -= K_NEG_D0 (-2**31), i.e. d += 2**31; only the resulting sign bit is used
  andnps d,a    ; d = ~d & a: sign bit set only where d>=0 and a<0

;Store brightness and hue
  addps a,[K_BRIGHT_MAGIC] ; shift the value into the lowest float byte
  addps o,[K_HUE_MAGIC]
  movaps [bx],a    ; 0x0480
  movaps [bx+si],o ; 0xa460 (bx=0x480, si=0x9fe0)

  movmskps ebp,d ; a<0 and d>=0? ok : 0 (background or grazing hit)  -> 4-bit mask consumed by "shr bp,1" in the next pass

;Next pixel
  inc di
  jnz X   ; ax=di=0

;Esc test, next frame
  in al,0x60 ; al = byte read from port 0x60 (keyboard data)
  dec ax  ; ah=0 from the last "mul di"
  jnz M   ; fallthrough
          ; when ax was 1, execution runs into MAP and leaves through its "ret"
          ; NOTE(review): presumably that ret pops the 0 word left on the stack by the table loop and so returns to offset 0 - confirm

;Return the box distance to the KIFS fractal
;In:    bx=0x480, si=0x9fe0, bp=0x5300, ch=0; d = depth; integer X at [bx], integer Y at [bx+si];
;       rotation constants at 0x420..0x47f
;Out:   a = -(distance estimate), o = orbit trap, bx=0x480, ch=0
;Clobb: x y z b c, bl, ch (during the loop), flags
MAP:            ; bx=0x480
;  movups x,[INT_X]
;  cvtdq2ps x,x
;  cvtdq2ps y,[INT_Y]
  cvtdq2ps x,[INT_X]       ; 4 packed int32 -> float
  cvtdq2ps y,[INT_Y]
;  mulps x,[K_X_SCALE]
;  mulps y,[K_Y_SCALE]

  movaps c,[K_TRANSLATION] ; c=-1/4: translation=[-c,-c/4,0]
  movaps o,c    ; o=-1/4
  movaps z,d

;Rotate in the XZ, YX and ZY planes
;Each pass rotates (x,z) by one stored angle and then cycles the three axes.
L mov bl,0x20   ; ch=0 on init
R movaps b,[COS]; b=C3 a=S3 | b=C2 a=S2 | b=C1 a=S1
  movaps a,[SIN]
  mulps b,z     ; b=Cz
  mulps z,a     ; z=Sz
  mulps a,x     ; a=Sx
  mulps x,[COS] ; x=Cx
  subps a,b     ; a=x'=Sx-Cz
  addps z,x     ; z=z'=Sz+Cx
  movaps x,y    ; cycle x,y,z <- y,z,a
  movaps y,z
  movaps z,a
  add bl,0x20   ; 0x20 | 0x40 | 0x60
  jns R         ; bx=0x480 a=z  (exits when bl reaches 0x80)

;Reflect along X and Y
  movaps b,[K_NEG_2] ; b=-2.0, bit pattern 0xc0000000
  orps x,b      ; x=-|x|
  orps y,b      ; y=-|y|
                ; the OR also sets exponent bit 30
                ; NOTE(review): presumably harmless because the coordinates are scaled by 2**31 - confirm

;Box-distance (L_inf) to the origin
  orps a,b      ; a=-|z|
  minps a,x
  minps a,y     ; a=-length = min(-|x|,-|y|,-|z|)

;Orbit trap
  minps o,a     ; orbit=min(orbit,-length)

;Translate by [-c,-c/4,0]
  mulps b,[K_NEG_HALF_SCALE]  ; b=0.25  -2 * -0.125 = 0.25
  mulps b,c     ; b=c/4
  subps x,c     ; x-=c
  subps y,b     ; y-=c/4

;Scale translation
  subps c,b     ; c-=c/4 (c*=3/4)

;Next iteration
  add ch,0x10   ; 16 iterations  (ch wraps to 0 after the 16th add)
  jnz L

  subps a,c
  subps a,c     ; a=-(length-2c)
  ret           ; bx=0x480
